PCA for wecare-only (b37) wes WECARE+NFE dataset w/o Danish:
To clarify
An e-mail was sent to bigsnpr package author:
The answer was qiock but poorly worded (intended to show off rather than to clarify):
https://github.com/privefl/bigstatsr/issues/128#issuecomment-768277757
References
# Time
Sys.time()
## [1] "2021-01-31 14:05:29 GMT"
# Memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 411961 22.1 856251 45.8 NA 641560 34.3
## Vcells 790279 6.1 8388608 64.0 16384 1768509 13.5
# Clean up
rm(list=ls())
graphics.off()
# Options
options(stringsAsFactors = F)
# Folders
base_folder <- "/Users/alexey/Documents"
project_folder <- file.path(base_folder,"wecare","final_analysis_2021","reanalysis_wo_danish_2021","s02_wes_wecare_nfe")
scripts_folder <- file.path(project_folder,"scripts","s07_relatedness_and_pca")
setwd(scripts_folder)
data_folder <- file.path(project_folder,"data","s07_relatedness_and_pca")
# Libraries
library(bigsnpr) # for bed_autoSVD() and bed()
## Loading required package: bigstatsr
library(bigutilsr) # for prob_dist() and tukey_mc_up() for outlier detection
library(hexbin) # for plotting svd loadings
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
NCORES <- 1
#NCORES <- nb_cores() # 2
# Location of bed file
bed_file <- file.path(data_folder,"s03_non_related","common_biallelic_autosomal_snps_in_HWE_norel.bed")
# Attach PLINK data to R environment
wecare.bed <- bed(bed_file) # bigsnpr::bed
# Explore wecare.bed
wecare.bed
## A 'bed' object with 531 samples and 18628 variants.
names(wecare.bed)
## [1] ".->bedfile" "light" ".self" ".->.map" "show" "extptr" "nrow" "bimfile" ".refClassDef" "bedfile" "ncol" "initialize" "map" "fam" ".map" ".->extptr" "famfile" ".fam" "address" ".->.fam" "prefix"
#attributes(wecare.bed)
#str(wecare.bed)
#wecare.bed$bedfile
#wecare.bed$address
# Clean-up
rm(bed_file)
# Phenotypes from plink
wecare_fam.df <- wecare.bed$fam
dim(wecare_fam.df)
## [1] 531 6
head(wecare_fam.df)
## family.ID sample.ID paternal.ID maternal.ID sex affection
## 1 HG00097 HG00097 0 0 2 -9
## 2 HG00099 HG00099 0 0 2 -9
## 3 HG00100 HG00100 0 0 2 -9
## 4 HG00102 HG00102 0 0 2 -9
## 5 HG00106 HG00106 0 0 2 -9
## 6 HG00110 HG00110 0 0 2 -9
table(wecare_fam.df$affection, useNA = "always")
##
## -9 1 2 <NA>
## 196 172 163 0
# Phenotypes from the main dataset file
load(file.path(project_folder,"data","s06_qc_filters","s04_filter_by_sample_call_rates.RData"))
rm(genotypes.mx,variants.df)
# Update folders
base_folder <- "/Users/alexey/Documents"
project_folder <- file.path(base_folder,"wecare","final_analysis_2021","reanalysis_wo_danish_2021","s02_wes_wecare_nfe")
scripts_folder <- file.path(project_folder,"scripts","s07_relatedness_and_pca")
data_folder <- file.path(project_folder,"data","s07_relatedness_and_pca")
dim(phenotypes.df)
## [1] 532 40
str(phenotypes.df)
## 'data.frame': 532 obs. of 40 variables:
## $ wes_id : chr "HG00097" "HG00099" "HG00100" "HG00102" ...
## $ gwas_id : chr NA NA NA NA ...
## $ merged_id : chr NA NA NA NA ...
## $ filter : chr "pass" "pass" "pass" "pass" ...
## $ cc : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ setno : int NA NA NA NA NA NA NA NA NA NA ...
## $ registry : chr NA NA NA NA ...
## $ family_history: int NA NA NA NA NA NA NA NA NA NA ...
## $ age_dx : int NA NA NA NA NA NA NA NA NA NA ...
## $ age_ref : int NA NA NA NA NA NA NA NA NA NA ...
## $ rstime : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig1_gwas : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig2_gwas : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig3_gwas : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig4_gwas : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig5_gwas : num NA NA NA NA NA NA NA NA NA NA ...
## $ stage : int NA NA NA NA NA NA NA NA NA NA ...
## $ er : int NA NA NA NA NA NA NA NA NA NA ...
## $ pr : int NA NA NA NA NA NA NA NA NA NA ...
## $ hist_cat : chr NA NA NA NA ...
## $ hormone : int NA NA NA NA NA NA NA NA NA NA ...
## $ chemo_cat : chr NA NA NA NA ...
## $ br_xray : int NA NA NA NA NA NA NA NA NA NA ...
## $ br_xray_dose : num NA NA NA NA NA NA NA NA NA NA ...
## $ age_menarche : int NA NA NA NA NA NA NA NA NA NA ...
## $ age_1st_ftp : int NA NA NA NA NA NA NA NA NA NA ...
## $ age_menopause : int NA NA NA NA NA NA NA NA NA NA ...
## $ num_preg : int NA NA NA NA NA NA NA NA NA NA ...
## $ bmi_age18 : num NA NA NA NA NA NA NA NA NA NA ...
## $ bmi_dx : num NA NA NA NA NA NA NA NA NA NA ...
## $ bmi_ref : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig1_wecare : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig2_wecare : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig3_wecare : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig4_wecare : num NA NA NA NA NA NA NA NA NA NA ...
## $ eig5_wecare : num NA NA NA NA NA NA NA NA NA NA ...
## $ Batch_ID : chr NA NA NA NA ...
## $ Subject_ID : int NA NA NA NA NA NA NA NA NA NA ...
## $ Barcode : chr NA NA NA NA ...
## $ danish : logi NA NA NA NA NA NA ...
table(phenotypes.df$cc, useNA = "always")
##
## -1 0 1 <NA>
## 197 172 163 0
table(phenotypes.df$filter, useNA = "always")
##
## eigenvectors_outlier pass <NA>
## 2 530 0
#phenotypes.df[phenotypes.df$filter=="eigenvectors_outlier","wes_id"]
#"Possibly_related_to_P1-D05" -> phenotypes.df[phenotypes.df$wes_id=="P5_C12","filter"]
# Merge fam-file and phenotypes from the main dataset (removing samples that are not in fam-file)
joined_phenotypes.df <- left_join(wecare_fam.df, phenotypes.df,
by=c("sample.ID"="wes_id"))
dim(joined_phenotypes.df)
## [1] 531 45
colnames(joined_phenotypes.df)
## [1] "family.ID" "sample.ID" "paternal.ID" "maternal.ID" "sex" "affection" "gwas_id" "merged_id" "filter" "cc" "setno" "registry" "family_history" "age_dx" "age_ref" "rstime" "eig1_gwas" "eig2_gwas" "eig3_gwas" "eig4_gwas" "eig5_gwas" "stage" "er" "pr" "hist_cat" "hormone" "chemo_cat" "br_xray" "br_xray_dose" "age_menarche" "age_1st_ftp" "age_menopause" "num_preg" "bmi_age18" "bmi_dx" "bmi_ref" "eig1_wecare" "eig2_wecare" "eig3_wecare" "eig4_wecare" "eig5_wecare" "Batch_ID" "Subject_ID" "Barcode" "danish"
table(joined_phenotypes.df$filter, useNA = "always")
##
## eigenvectors_outlier pass <NA>
## 2 529 0
table(joined_phenotypes.df$cc, useNA = "always")
##
## -1 0 1 <NA>
## 196 172 163 0
table(joined_phenotypes.df$affection, useNA = "always")
##
## -9 1 2 <NA>
## 196 172 163 0
#sum(joined_phenotypes.df$sample.ID == "P5_C12")
# Make sure that dplyr::left_joint hasnt changed the order of samples
sum(substr(joined_phenotypes.df$merged_id,1,6) != wecare_fam.df$sample.ID,na.rm = T)
## [1] 0
# Add column for PCA outliers ?
joined_phenotypes.df <- data.frame(joined_phenotypes.df, pca_outlier=F)
# Clean-up
rm(phenotypes.df, wecare_fam.df)
Data from bed-bim-fam only: explored; is it needed for calculation??
# map file
wecare_map.df <- wecare.bed$map
dim(wecare_map.df)
## [1] 18628 6
head(wecare_map.df)
## chromosome marker.ID genetic.dist physical.pos allele1 allele2
## 1 1 1_881627_G_A 0 881627 G A
## 2 1 1_897325_G_C 0 897325 G C
## 3 1 1_897738_C_T 0 897738 T C
## 4 1 1_982941_T_C 0 982941 T C
## 5 1 1_982994_T_C 0 982994 T C
## 6 1 1_987200_C_T 0 987200 C T
# make simple counts
wecare_maf.df <- bed_MAF(wecare.bed)
dim(wecare_maf.df)
## [1] 18628 5
head(wecare_maf.df)
## ac mac af maf N
## 1 326 326 0.33887734 0.33887734 481
## 2 63 63 0.06745182 0.06745182 467
## 3 60 60 0.06479482 0.06479482 463
## 4 93 93 0.09011628 0.09011628 516
## 5 92 92 0.08829175 0.08829175 521
## 6 104 104 0.09942639 0.09942639 523
# merge map file with the counts
joined_variants.df <- cbind(wecare_map.df,wecare_maf.df)
dim(joined_variants.df)
## [1] 18628 11
head(joined_variants.df)
## chromosome marker.ID genetic.dist physical.pos allele1 allele2 ac mac af maf N
## 1 1 1_881627_G_A 0 881627 G A 326 326 0.33887734 0.33887734 481
## 2 1 1_897325_G_C 0 897325 G C 63 63 0.06745182 0.06745182 467
## 3 1 1_897738_C_T 0 897738 T C 60 60 0.06479482 0.06479482 463
## 4 1 1_982941_T_C 0 982941 T C 93 93 0.09011628 0.09011628 516
## 5 1 1_982994_T_C 0 982994 T C 92 92 0.08829175 0.08829175 521
## 6 1 1_987200_C_T 0 987200 C T 104 104 0.09942639 0.09942639 523
# Variants with AF(ref) < AF(alt)
inverted <- joined_variants.df$ac != joined_variants.df$mac
sum(inverted)
## [1] 5
joined_variants.df[inverted,]
## chromosome marker.ID genetic.dist physical.pos allele1 allele2 ac mac af maf N
## 2594 2 2_101010082_G_C 0 101010082 G C 513 511 0.5009766 0.4990234 512
## 3101 2 2_215855780_G_A 0 215855780 G A 527 525 0.5009506 0.4990494 526
## 6368 6 6_28542520_C_T 0 28542520 C T 510 508 0.5009823 0.4990177 509
## 7057 6 6_97613163_T_C 0 97613163 C T 524 522 0.5009560 0.4990440 523
## 7431 6 6_166579270_C_T 0 166579270 C T 512 510 0.5009785 0.4990215 511
# Clean-up
rm(wecare_map.df, wecare_maf.df, inverted)
Could be sequentially repeated if outliers are detected
Takes care about LD etc.
See ?plot.big_SVD for plotting svd objets.
# Get indices of non-outliers in format required by bed_autoSVD
# (integer indices, indicating row numbers)
non_outliers1 <- which(!joined_phenotypes.df$pca_outlier)
length(non_outliers1)
## [1] 531
# bigsnpr::bed_autoSVD, Default k = 10
#using non-outlier samples (ind.row) and all variants (ind.col)
#table(wecare.bed$map$chromosome) - if complains abotut non-numeric chromosomes
wecare.svd1 <- bed_autoSVD(wecare.bed,
ind.row=non_outliers1,
ncores = NCORES)
##
## Phase of clumping (on MAC) at r^2 > 0.2.. keep 9707 variants.
## Discarding 0 variant with MAC < 10.
##
## Iteration 1:
## Computing SVD..
## 132 outlier variants detected..
## 1 long-range LD region detected..
##
## Iteration 2:
## Computing SVD..
## 0 outlier variant detected..
##
## Converged!
#ind.col=vars_not_in_LD,
# Variants not in LD (detected by clumping during autoSVD)
vars_not_in_LD1 <- attr(wecare.svd1, "subset")
length(vars_not_in_LD1)
## [1] 9575
#attributes(wecare.svd)
str(wecare.svd1)
## List of 7
## $ d : num [1:10] 159 140 126 120 120 ...
## $ u : num [1:531, 1:10] 0.01862 0.04007 0.04438 0.04531 0.00228 ...
## $ v : num [1:9575, 1:10] -0.00744 -0.00137 -0.00804 -0.00855 0.00478 ...
## $ niter : num 16
## $ nops : num 248
## $ center: num [1:9575] 0.678 0.135 0.199 0.198 0.425 ...
## $ scale : num [1:9575] 0.669 0.355 0.423 0.422 0.579 ...
## - attr(*, "class")= chr "big_SVD"
## - attr(*, "subset")= int [1:9575] 1 2 6 7 9 11 12 13 15 16 ...
## - attr(*, "lrldr")='data.frame': 1 obs. of 3 variables:
## ..$ Chr : num 6
## ..$ Start: num 33648228
## ..$ Stop : num 55266625
# Eigenvalues
length(wecare.svd1$d)
## [1] 10
wecare.svd1$d
## [1] 159.4512 139.5095 126.1602 120.4182 120.0438 119.5141 119.2688 119.1245 118.7687 118.3971
plot(wecare.svd1) # default type="screeplot" see ?plot.big_SVD
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?
# Loadings
dim(wecare.svd1$v)
## [1] 9575 10
head(wecare.svd1$v)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] -0.007442316 0.003834898 -0.0004928298 -0.011781062 -0.016935819 0.004184202 0.003527457 -0.006785065 -0.012506751 0.018771114
## [2,] -0.001368639 0.004052780 -0.0082892737 0.002937288 -0.010744252 0.013751571 0.002355065 -0.005999013 -0.024960317 0.008184192
## [3,] -0.008036731 -0.001822211 -0.0116159936 -0.005933570 0.003520575 -0.027334667 -0.001172901 0.005634025 0.001903181 -0.002592940
## [4,] -0.008545381 0.005182089 0.0117814320 0.001751618 0.015245286 -0.021063106 0.004673890 -0.012092966 0.011323376 0.002888324
## [5,] 0.004776277 -0.023937956 -0.0073829825 0.001888633 0.006850649 0.004019456 -0.007861765 -0.004675818 0.005442361 -0.003333888
## [6,] -0.011186694 -0.021340043 0.0232951518 0.007873729 -0.000140632 -0.011679733 -0.003482157 -0.015028438 -0.002562494 -0.014964663
# Loadings summary (for PCs from 1 to 10)
plot(wecare.svd1,type="loadings",loadings=1:10,coeff=0.4)
# Eigenvectors
dim(wecare.svd1$u)
## [1] 531 10
head(wecare.svd1$u)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 0.018622738 0.019854709 0.021727834 0.040897274 -0.023522635 -0.12298425 0.013204443 0.016949692 0.052266173 0.005252450
## [2,] 0.040068650 -0.030401205 -0.007037642 -0.005381513 0.063195958 -0.03780878 -0.004445595 -0.058644607 0.013258496 0.056564389
## [3,] 0.044375826 0.005365033 0.005296443 -0.041839396 0.007915045 -0.03751976 0.015726508 0.005636768 0.048383012 0.020846493
## [4,] 0.045311987 0.022983335 0.018884413 0.028688205 -0.017291190 -0.08735035 0.056054534 0.044640043 0.013983675 -0.022166526
## [5,] 0.002278271 0.006678138 0.048083894 -0.010012405 -0.025999349 0.03106204 0.093311736 -0.080248583 -0.009133683 0.049155780
## [6,] 0.036247260 0.007615541 0.087170557 0.087995074 -0.004003836 0.03703239 0.028481750 -0.012679235 -0.115985155 -0.009126424
# PCA summary (for PCs from 1 to 10)
plot(wecare.svd1,type = "scores",scores=1:10,coeff=0.4)
# Calculate a measure of "outlieness"
U1 <- wecare.svd1$u
prob1 <- prob_dist(U1, ncores=NCORES) # bigutilsr::prob_dist
S1 <- prob1$dist.self / sqrt(prob1$dist.nn)
tukey_threshold1 <- tukey_mc_up(S1) # bigutilsr::tukey_mc_up
# Outliers
outliers1 <- S1 >= tukey_threshold1
sum(outliers1)
## [1] 3
outliers_id1 <- wecare.bed$fam$sample.ID[S1 >= tukey_threshold1]
outliers_id1
## [1] "P1_E06" "P2_C12" "P5_E09"
# Histogram by "outlieness" score
ggplot() +
geom_histogram(aes(S1), color = "black", fill = "blue", alpha = 0.3) +
theme_bigstatsr() +
geom_vline(xintercept=tukey_threshold1, colour="red") +
labs(x = "Statistic of outlierness (S)", y = "Frequency (sqrt-scale)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Location of outlier(s) in PCA plots
plot(U1[, 1:2], col = (S1 > tukey_threshold1) + 1, pch = 20)
plot(U1[, 3:4], col = (S1 > tukey_threshold1) + 1, pch = 20)
plot(U1[, 5:6], col = (S1 > tukey_threshold1) + 1, pch = 20)
# Add outlier to the phenotypes data frame
joined_phenotypes.df$pca_outlier <- joined_phenotypes.df$pca_outlier |
joined_phenotypes.df$sample.ID %in% outliers_id1
sum(joined_phenotypes.df$pca_outlier)
## [1] 3
# Clean-up
rm(non_outliers1,U1,prob1,S1,tukey_threshold1,
outliers1, wecare.svd1, outliers_id1) #vars_not_in_LD1,
Re-run w/o previously detected outlier(s)
Using previously calculated vars_not_in_LD to speed-up
https://privefl.github.io/bigsnpr/articles/bedpca.html
# Get indices of non-outliers in format required by bed_autoSVD
# (integer indices, indicating row numbers)
non_outliers2 <- which(!joined_phenotypes.df$pca_outlier)
length(non_outliers2)
## [1] 528
# Calculate PCA
wecare.svd2 <- bed_autoSVD(wecare.bed,
ind.row=non_outliers2,
ncores = NCORES)
##
## Phase of clumping (on MAC) at r^2 > 0.2.. keep 9704 variants.
## Discarding 0 variant with MAC < 10.
##
## Iteration 1:
## Computing SVD..
## 131 outlier variants detected..
## 1 long-range LD region detected..
##
## Iteration 2:
## Computing SVD..
## 0 outlier variant detected..
##
## Converged!
# ind.col=vars_not_in_LD1 - removes the outlier
# Variants not in LD (detected by clumping during autoSVD)
vars_not_in_LD2 <- attr(wecare.svd2, "subset")
length(vars_not_in_LD2)
## [1] 9573
length(vars_not_in_LD1)
## [1] 9575
# Explore PCA results
plot(wecare.svd2)
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?
plot(wecare.svd2, type = "loadings", loadings=1:10, coeff=0.4)
plot(wecare.svd2, type = "scores", scores=1:10, coeff=0.4)
# Calculate a measure of "outlieness"
U2 <- wecare.svd2$u
prob2 <- prob_dist(U2, ncores=NCORES) # bigutilsr::prob_dist
S2 <- prob2$dist.self / sqrt(prob2$dist.nn)
tukey_threshold2 <- tukey_mc_up(S2) # bigutilsr::tukey_mc_up
# Outliers
outliers2 <- S2 >= tukey_threshold2
sum(outliers2)
## [1] 0
#outliers_id2 <- wecare.bed$fam$sample.ID[S2 >= tukey_threshold2]
#outliers_id2
# Histogram by "outlieness" score
ggplot() +
geom_histogram(aes(S2), color = "black", fill = "blue", alpha = 0.3) +
theme_bigstatsr() +
geom_vline(xintercept=tukey_threshold2, colour="red") +
labs(x = "Statistic of outlierness (S)", y = "Frequency (sqrt-scale)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Location of outlier(s) in PCA plots
#plot(U2[, 1:2], col = (S2 > tukey_threshold2) + 1, pch = 20)
#plot(U2[, 3:4], col = (S2 > tukey_threshold2) + 1, pch = 20)
#plot(U2[, 5:6], col = (S2 > tukey_threshold2) + 1, pch = 20)
# Add outlier to the phenotypes data frame
#joined_phenotypes.df$pca_outlier <- joined_phenotypes.df$pca_outlier |
# joined_phenotypes.df$sample.ID %in% outliers_id2
#sum(joined_phenotypes.df$pca_outlier)
# Clean-up
rm(non_outliers2,U2,prob2,S2,tukey_threshold2,outliers2,
vars_not_in_LD1,vars_not_in_LD2) # vars_not_in_LD2, outliers_id2,wecare.svd2
Omitted because of no outliers
Re-run w/o previously detected outlier(s)
Using previously calculated vars_not_in_LD to speed-up
phenotypes_with_PCs.df <- joined_phenotypes.df[!joined_phenotypes.df$pca_outlier,]
dim(phenotypes_with_PCs.df)
## [1] 528 46
#eigenvectors.mx <- wecare.svd3$u
eigenvectors.mx <- wecare.svd2$u
dim(eigenvectors.mx)
## [1] 528 10
colnames(eigenvectors.mx) <-
c("pc1","pc2","pc3","pc4","pc5","pc6","pc7","pc8","pc9","pc10")
phenotypes_with_PCs.df <- cbind(phenotypes_with_PCs.df, eigenvectors.mx)
dim(phenotypes_with_PCs.df)
## [1] 528 56
colnames(phenotypes_with_PCs.df)
## [1] "family.ID" "sample.ID" "paternal.ID" "maternal.ID" "sex" "affection" "gwas_id" "merged_id" "filter" "cc" "setno" "registry" "family_history" "age_dx" "age_ref" "rstime" "eig1_gwas" "eig2_gwas" "eig3_gwas" "eig4_gwas" "eig5_gwas" "stage" "er" "pr" "hist_cat" "hormone" "chemo_cat" "br_xray" "br_xray_dose" "age_menarche" "age_1st_ftp" "age_menopause" "num_preg" "bmi_age18" "bmi_dx" "bmi_ref" "eig1_wecare" "eig2_wecare" "eig3_wecare" "eig4_wecare" "eig5_wecare" "Batch_ID" "Subject_ID" "Barcode" "danish" "pca_outlier" "pc1" "pc2" "pc3" "pc4" "pc5" "pc6" "pc7" "pc8" "pc9" "pc10"
# Check consistency with the previous eigenvectors from WES
sum(is.na((phenotypes_with_PCs.df$eig1_wecare)))
## [1] 196
plot(phenotypes_with_PCs.df$eig1_wecare,
phenotypes_with_PCs.df$pc1,main="PC1: new vs old WES")
plot(phenotypes_with_PCs.df$eig2_wecare,
phenotypes_with_PCs.df$pc2,main="PC2: new vs old WES")
# Check consistency with the previous eigenvectors from GWAS
sum(is.na((phenotypes_with_PCs.df$eig1_gwas)))
## [1] 196
plot(phenotypes_with_PCs.df$eig1_gwas,
phenotypes_with_PCs.df$pc1,main="PC1: new WES vs GWAs")
plot(phenotypes_with_PCs.df$eig2_gwas,
phenotypes_with_PCs.df$pc2,main="PC2: new WES vs GWAs")
# Clean-up
rm(joined_phenotypes.df, eigenvectors.mx) # vars_not_in_LD1
#somehow close wecare.bed ?
P6_D05 and P5_E09 are clear outliers when projected to 1kgp
sd_threshold <- 6
# PC1 outliers
pc1 <- phenotypes_with_PCs.df$pc1
pc1_mean <- mean(pc1)
pc1_sd <- sd(pc1)
lo_pc1 <- pc1 < pc1_mean - sd_threshold * pc1_sd
hi_pc1 <- pc1 > pc1_mean + sd_threshold * pc1_sd
cat("pc1 lo/hi:",sum(lo_pc1),"/",sum(hi_pc1),"\n")
## pc1 lo/hi: 0 / 0
phenotypes_with_PCs.df$sample.ID[lo_pc1]
## character(0)
phenotypes_with_PCs.df$sample.ID[hi_pc1]
## character(0)
# PC2 outliers
pc2 <- phenotypes_with_PCs.df$pc2
pc2_mean <- mean(pc2)
pc2_sd <- sd(pc2)
lo_pc2 <- pc2 < pc2_mean - sd_threshold * pc2_sd
hi_pc2 <- pc2 > pc2_mean + sd_threshold * pc2_sd
cat("pc2 lo/hi:",sum(lo_pc2),"/",sum(hi_pc2),"\n")
## pc2 lo/hi: 0 / 5
phenotypes_with_PCs.df$sample.ID[lo_pc2]
## character(0)
phenotypes_with_PCs.df$sample.ID[hi_pc2]
## [1] "P1_C05" "P1_C12" "P1_D08" "P5_E02" "P6_D05"
# PC3 outliers
pc3 <- phenotypes_with_PCs.df$pc3
pc3_mean <- mean(pc3)
pc3_sd <- sd(pc3)
lo_pc3 <- pc3 < pc3_mean - sd_threshold * pc3_sd
hi_pc3 <- pc3 > pc3_mean + sd_threshold * pc3_sd
cat("pc3 lo/hi:",sum(lo_pc3),"/",sum(hi_pc3),"\n")
## pc3 lo/hi: 0 / 0
phenotypes_with_PCs.df$sample.ID[lo_pc3]
## character(0)
phenotypes_with_PCs.df$sample.ID[hi_pc3]
## character(0)
# Clean-up
rm(sd_threshold,
pc1, pc1_mean, pc1_sd, lo_pc1, hi_pc1,
pc2, pc2_mean, pc2_sd, lo_pc2, hi_pc2,
pc3, pc3_mean, pc3_sd, lo_pc3, hi_pc3)
plot(wecare.svd2, type = "scores") +
aes(color = as.factor(phenotypes_with_PCs.df$affection)) +
labs(title = NULL, color = "Case")
# + geom_hline(pc2_mean ... - sd_threshold * pc2_sd,
# linetype="dashed", color = "red")
plot(wecare.svd2, type = "scores", scores=3:4) +
aes(color = as.factor(phenotypes_with_PCs.df$affection)) +
labs(title = NULL, color = "Case")
plot(wecare.svd2, type = "scores", scores=5:6) +
aes(color = as.factor(phenotypes_with_PCs.df$affection)) +
labs(title = NULL, color = "Case")
plot(wecare.svd2, type = "scores", scores=7:8) +
aes(color = as.factor(phenotypes_with_PCs.df$affection)) +
labs(title = NULL, color = "Case")
plot(wecare.svd2, type = "scores", scores=9:10) +
aes(color = as.factor(phenotypes_with_PCs.df$affection)) +
labs(title = NULL, color = "Case")
save.image(file.path(data_folder,"s04_calculate_PCs.RData"))
save(phenotypes_with_PCs.df,file=file.path(data_folder,"s04_phenotypes_with_PCs.RData"))
ls()
## [1] "base_folder" "data_folder" "joined_variants.df" "NCORES" "phenotypes_with_PCs.df" "project_folder" "scripts_folder" "wecare.bed" "wecare.svd2"
Sys.time()
## [1] "2021-01-31 14:06:07 GMT"
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2135110 114.1 6058580 323.6 NA 6058580 323.6
## Vcells 4587992 35.1 28599140 218.2 16384 69822102 532.8